library(ggplot2)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(fpc)
library(dbscan)
##
## Attaching package: 'dbscan'
## The following object is masked from 'package:fpc':
##
## dbscan
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
# Fix the RNG seed so that the randomly initialised clustering calls further
# down in this script start from a known state.
set.seed(1122)
# We should remove the labels as clustering is an unsupervised algorithm.
# Get rid of "Name".
paste("We should remove the labels as clustering is an unsupervised algorithm. Get rid of 'Name'")
## [1] "We should remove the labels as clustering is an unsupervised algorithm. Get rid of 'Name'"
# Standardisation itself happens later, once the attribute table has been built.
paste("The data is between 0 and 5. We can standardize this to to get a better understanding of the data, where mean=0, var=1. Will do this after getting the dataset.")
## [1] "The data is between 0 and 5. We can standardize this to to get a better understanding of the data, where mean=0, var=1. Will do this after getting the dataset."
# Here I will manually clean the raw text file and build the attribute table.
# The file is read as tab-separated text; the parsing below assumes this yields
# a single column holding one whole line per entry -- TODO confirm against the
# file.
data2 <- read.table(file = "file19.txt", header = FALSE, sep = "\t", dec = ".")
# Drop the first three rows, then keep entries 2..67 of what remains (the same
# 66 lines the original row-by-row loop visited).
raw_lines <- data2[-(1:3), ]
line_chars <- str_split(raw_lines[2:67], "")
# 17 19 21 23 25 27 29 31 for attributes
attr_positions <- seq(17L, 31L, by = 2L)
# Parse all lines in one pass instead of growing a data frame with rbind();
# vapply() returns one column per line, so transpose to get one row per line.
attr_matrix <- vapply(
  line_chars,
  function(chars) strtoi(chars[attr_positions]),
  integer(length(attr_positions))
)
dat_frame <- as.data.frame(t(attr_matrix))
colnames(dat_frame) <- c("I", "i", "C", "c", "P", "p", "M", "m")
# Normalize the data (scale() centres each column and divides by its sd).
df <- as.data.frame(scale(dat_frame))
# The CSV round trip is kept on purpose: the values used downstream are the
# ones read back from disk, exactly as in the recorded run.
write.csv(df, file = "file19saved.csv", row.names = FALSE)
df <- read.csv("file19saved.csv", header = TRUE, sep = ",")
head(df)
## I i C c P p M
## 1 2.6808138 1.3970672 0.8257228 0.9059288 0.3610716 0.4549113 1.3254890
## 2 1.0037170 0.5353248 0.8257228 0.9059288 1.1828206 1.2450204 0.4678196
## 3 1.0037170 -0.3264176 0.8257228 -1.0871146 0.3610716 0.4549113 0.4678196
## 4 1.0037170 0.5353248 0.8257228 0.9059288 1.1828206 1.2450204 0.4678196
## 5 0.1651686 0.5353248 0.8257228 0.9059288 0.3610716 0.4549113 0.4678196
## 6 0.1651686 0.5353248 0.8257228 0.9059288 -0.4606775 0.4549113 0.4678196
## m
## 1 1.3404041
## 2 0.3574411
## 3 0.3574411
## 4 0.3574411
## 5 0.3574411
## 6 0.3574411
# Choose the number of clusters for the standardised attribute table.
# NOTE: the statements below consume random numbers, so their order (relative
# to the set.seed() call earlier in the script) determines the recorded output.
fviz_nbclust(df, kmeans, method = "silhouette")
# Alternative criterion (within-cluster sum of squares), left disabled.
#fviz_nbclust(df, kmeans, method = "wss")
# Fit k-means with 8 centres; `k` is inspected by the statements that follow.
k <- kmeans(df, centers=8)
paste("looking at silhoutte graph gives 8 clusters.")
## [1] "looking at silhoutte graph gives 8 clusters."
fviz_cluster(k, df, main="K-means Cluster with k=8")
# Cluster label assigned to each of the 66 rows.
k$cluster
## [1] 7 7 7 7 1 1 1 1 1 1 1 8 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 7 7 7 7 4 7 4
## [39] 4 2 2 2 4 2 2 4 2 2 2 2 2 4 4 5 2 5 1 6 6 6 6 6 6 6 6 6
# The counts in this string are typed in by hand from the run recorded above;
# they are not recomputed from `k`.
paste("There are this many points in each clusters: 1: 8, 2: 11, 3: 19, 4: 7, 5: 2, 6: 9, 7: 9, 8: 1")
## [1] "There are this many points in each clusters: 1: 8, 2: 11, 3: 19, 4: 7, 5: 2, 6: 9, 7: 9, 8: 1"
paste("total SSE-WSS of the clusters:")
## [1] "total SSE-WSS of the clusters:"
# Total within-cluster sum of squares over all clusters.
k$tot.withinss
## [1] 55.12383
paste("SSEs for each clusters")
## [1] "SSEs for each clusters"
# Within-cluster sum of squares, one value per cluster.
k$withinss
## [1] 4.244696 5.932404 15.568186 3.606768 2.223560 6.337449 17.210772
## [8] 0.000000
# Hand-rounded copy of the k$withinss values printed above.
paste("SSE in each clusters: 1: 4, 2: 6, 3: 16, 4: 4, 5: 2, 6: 6, 7: 17, 8: 0")
## [1] "SSE in each clusters: 1: 4, 2: 6, 3: 16, 4: 4, 5: 2, 6: 6, 7: 17, 8: 0"
# Walk through the eight k-means clusters one at a time: print the row indices
# assigned to each label, followed by a short qualitative assessment.

# Cluster 1
which(k[["cluster"]] == 1)
## [1] 5 6 7 8 9 10 11 57
paste("all of these are mostly bat types of animals")
## [1] "all of these are mostly bat types of animals"

# Cluster 2
which(k[["cluster"]] == 2)
## [1] 40 41 42 44 45 47 48 49 50 51 55
paste("This cluster is cut into two with mole-type animals and cougar type animals.")
## [1] "This cluster is cut into two with mole-type animals and cougar type animals."

# Cluster 3
which(k[["cluster"]] == 3)
## [1] 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
paste("mostly weasel-squirrel type of animals. However there are a lot of variety. This cluster needs improvement.")
## [1] "mostly weasel-squirrel type of animals. However there are a lot of variety. This cluster needs improvement."

# Cluster 4
which(k[["cluster"]] == 4)
## [1] 36 38 39 43 46 52 53
paste("these are wild-cat like animals. Good job in clustering")
## [1] "these are wild-cat like animals. Good job in clustering"

# Cluster 5
which(k[["cluster"]] == 5)
## [1] 54 56
paste("these are walrus and elephant seal animals. The cluster is too small.")
## [1] "these are walrus and elephant seal animals. The cluster is too small."

# Cluster 6
which(k[["cluster"]] == 6)
## [1] 58 59 60 61 62 63 64 65 66
paste("This cluster is mostly elk type animals. Generally good.")
## [1] "This cluster is mostly elk type animals. Generally good."

# Cluster 7
which(k[["cluster"]] == 7)
## [1] 1 2 3 4 32 33 34 35 37
paste("These are wild-cats and moles. Cluster should be divided into two.")
## [1] "These are wild-cats and moles. Cluster should be divided into two."

# Cluster 8 (a single row)
which(k[["cluster"]] == 8)
## [1] 12
paste("this is an armadillo, it is natural that this was quite different than the others.")
## [1] "this is an armadillo, it is natural that this was quite different than the others."

# Overall verdict on the k = 8 solution.
paste("Overall, the clustering is good. Some clusters needs improvement, we can do it maybe by increasing the amount of clusters.")
## [1] "Overall, the clustering is good. Some clusters needs improvement, we can do it maybe by increasing the amount of clusters."
# Load the second data set (columns x and y) and inspect its raw scale.
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
df2 <- read.csv("s1.csv", header = TRUE, sep = ",")
head(df2)
## x y
## 1 664159 550946
## 2 665845 557965
## 3 597173 575538
## 4 618600 551446
## 5 635690 608046
## 6 588100 557588
colMeans(df2)
## x y
## 514937.6 494709.3
paste("We have to normalize the data since the numbers are too big to make sense of. Normalizing will allow the mean to be 0, and the variance to be 1.")
## [1] "We have to normalize the data since the numbers are too big to make sense of. Normalizing will allow the mean to be 0, and the variance to be 1."
# Standardise both columns; scale() returns a matrix, so convert back to a
# data frame before the column means are checked and the data is plotted.
df2 <- as.data.frame(scale(df2))
head(df2)
## x y
## 1 0.6103978 0.2384519
## 2 0.6172944 0.2682135
## 3 0.3363882 0.3427256
## 4 0.4240364 0.2405720
## 5 0.4939439 0.4805644
## 6 0.2992746 0.2666150
# Column means are now zero up to floating-point error.
colMeans(df2)
## x y
## -4.381495e-17 1.983969e-17
plot(df2, main = "Data without clustering")
paste("I can see 15 clusters in here, which are well separated.")
## [1] "I can see 15 clusters in here, which are well separated."
# Explore the number of clusters for the standardised x/y data, up to k = 18,
# with both the within-sum-of-squares and the silhouette criteria.
# NOTE: every call in this section draws random numbers, so the recorded
# results depend on the statements running in exactly this order.
fviz_nbclust(df2, kmeans, method = "wss", k.max=18)
fviz_nbclust(df2, kmeans, method = "silhouette", k.max=18)
# Try several candidate values of k. `k2` is overwritten by each fit, and the
# paste() remark after each plot is a manual reading of that plot.
k2 <- kmeans(df2, centers=14)
fviz_cluster(k2, df2, main="K-means with centers=14")
paste("Problem with 4 clusters.")
## [1] "Problem with 4 clusters."
k2 <- kmeans(df2, centers=15)
fviz_cluster(k2, df2, main="K-means with centers=15")
paste("Problem with 4 clusters.")
## [1] "Problem with 4 clusters."
k2 <- kmeans(df2, centers=12)
fviz_cluster(k2, df2, main="K-means with centers=12")
paste("Problem with 6 clusters.")
## [1] "Problem with 6 clusters."
# The 13-centre fit is run last on purpose: it is the one left in `k2` for the
# summaries printed below.
k2 <- kmeans(df2, centers=13)
fviz_cluster(k2, df2, main="K-means with centers=13")
paste("Problem with 2 clusters.")
## [1] "Problem with 2 clusters."
paste("Therefore, I will select 13 clusters for k-means. The clustering is generally better more consistently .")
## [1] "Therefore, I will select 13 clusters for k-means. The clustering is generally better more consistently ."
paste("some clusters are quite close to each other, and the data points are close to each other in each cluster too.")
## [1] "some clusters are quite close to each other, and the data points are close to each other in each cluster too."
# Within-cluster sum of squares for each of the 13 clusters.
k2$withinss
## [1] 12.196766 10.691121 115.641971 123.234195 8.158955 5.003423
## [7] 9.867617 10.778042 27.777577 12.654594 93.374137 3.258654
## [13] 11.028566
# Number of points in each of the 13 clusters.
k2$size
## [1] 345 338 624 686 329 216 351 297 399 340 633 125 317
# DBSCAN on the standardised x/y data: fix minPts, read a candidate eps off the
# k-nearest-neighbour distance plot, then compare fits for several eps values
# around it.
paste("Since we have 2-D data, We can set it to 4. UPDATE: after grid searching with different minPts and epsses, I decided to set it to 6. Considering our data is condensed, we can do this.")
## [1] "Since we have 2-D data, We can set it to 4. UPDATE: after grid searching with different minPts and epsses, I decided to set it to 6. Considering our data is condensed, we can do this."
K <- 6
dbscan::kNNdistplot(df2, K)
paste("I can see that the eps can be 0.08")
## [1] "I can see that the eps can be 0.08"
abline(h = 0.08, lty = 2)

# Build each plot title from the parameters stored on the fitted object, so the
# label always matches the fit. The hand-typed titles used before said
# "minPts=5" for fits made with MinPts = 6, and one carried the wrong eps.
dbscan_title <- function(fit) {
  paste0("Cluster Plot minPts=", fit$MinPts, ", eps=", fit$eps)
}

# fpc::dbscan is called explicitly because dbscan::dbscan masks it.
db <- fpc::dbscan(df2, eps = 0.09, MinPts = K)
fviz_cluster(db, df2, geom = "point", main = dbscan_title(db))
print(db)
## dbscan Pts=5000 MinPts=6 eps=0.09
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## border 140 15 13 5 1 13 4 11 8 12 10 17 15 4 9 12
## seed 0 264 310 1 309 323 304 313 313 648 318 309 310 342 333 314
## total 140 279 323 6 310 336 308 324 321 660 328 326 325 346 342 326
db <- fpc::dbscan(df2, eps = 0.085, MinPts = K)
fviz_cluster(db, df2, geom = "point", main = dbscan_title(db))
print(db)
## dbscan Pts=5000 MinPts=6 eps=0.085
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## border 163 14 13 5 2 16 6 12 9 13 11 17 14 7 11 13
## seed 0 262 308 1 308 318 302 307 311 645 316 308 306 339 330 313
## total 163 276 321 6 310 334 308 319 320 658 327 325 320 346 341 326
db <- fpc::dbscan(df2, eps = 0.082, MinPts = K)
fviz_cluster(db, df2, geom = "point", main = dbscan_title(db))
print(db)
## dbscan Pts=5000 MinPts=6 eps=0.082
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## border 188 14 13 5 2 17 8 15 8 5 13 12 18 16 7 11 6
## seed 0 259 308 1 308 317 300 304 308 321 319 313 306 302 339 328 309
## total 188 273 321 6 310 334 308 319 316 326 332 325 324 318 346 339 315
db <- fpc::dbscan(df2, eps = 0.08, MinPts = K)
fviz_cluster(db, df2, geom = "point", main = dbscan_title(db))
print(db)
## dbscan Pts=5000 MinPts=6 eps=0.08
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## border 208 14 14 2 18 11 18 8 6 14 14 16 16 7 11 6
## seed 0 259 307 308 316 297 301 308 319 318 310 304 296 338 327 309
## total 208 273 321 310 334 308 319 316 325 332 324 320 312 345 338 315
# This is the last fit, so `db` holds the eps = 0.078 result from here on.
db <- fpc::dbscan(df2, eps = 0.078, MinPts = K)
fviz_cluster(db, df2, geom = "point", main = dbscan_title(db))
print(db)
## dbscan Pts=5000 MinPts=6 eps=0.078
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## border 219 18 16 2 16 11 21 11 10 17 13 4 15 19 7 12 4 8
## seed 0 255 304 308 311 296 297 305 315 314 303 4 302 292 337 325 2 307
## total 219 273 320 310 327 307 318 316 325 331 316 8 317 311 344 337 6 315
# The 15-cluster result referred to here is the eps = 0.08 run above.
paste("Best result is from eps=0.08 with 15 clusters that make sense.")
## [1] "Best result is from eps=0.08 with 15 clusters that make sense."
paste("At minPts = 4, I tried different epsses. Best eps = 0.08, there are 20 clusters. At minPts = 5, I tried different epsses. Best eps = 0.08, there are 17 clusters. At minPts = 6, I tried different epsses. Best eps = 0.08, there are 15 clusters.")
## [1] "At minPts = 4, I tried different epsses. Best eps = 0.08, there are 20 clusters. At minPts = 5, I tried different epsses. Best eps = 0.08, there are 17 clusters. At minPts = 6, I tried different epsses. Best eps = 0.08, there are 15 clusters."
# NOTE(review): this conclusion names minPts=5, while every fit in this section
# uses K = 6 and the note above says 6 was chosen -- confirm which is intended.
paste("Overall, best result was from when minPts=5, eps=0.08. There was less data loss, and the clusters made sense.")
## [1] "Overall, best result was from when minPts=5, eps=0.08. There was less data loss, and the clusters made sense."